{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "# Apply the seaborn theme FIRST: sns.set() rewrites the font-related rcParams,\n",
    "# so configuring the CJK font before it (as was done previously) gets overwritten\n",
    "# and Chinese labels no longer render with SimHei.\n",
    "sns.set()\n",
    "plt.rcParams['font.sans-serif']=['SimHei']\n",
    "plt.rcParams['axes.unicode_minus'] = False\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   category                                            content\n",
       "0         2  合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...\n",
       "1         2  公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。\n",
       "2         1  公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...\n",
       "3         2  公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...\n",
       "4         2  该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专..."
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the corpus; header=None together with names= supplies the two column labels explicitly\n",
    "data=pd.read_csv(\n",
    "    \"./training.csv\",\n",
    "    header=None,\n",
    "    names=['category','content'],\n",
    "    encoding='utf-8',\n",
    ")\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4774, 2)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: (rows, columns) of the loaded frame\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. 分词（20分）：由于企业描述是文本信息，需要对文本信息进行特征提取。文本分词可采用Jieba分词："
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第一部分：分词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "contents=data.content.values.tolist()# plain Python list holding the values of the content column, in row order"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\Administrator\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 1.596 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "import jieba\n",
    "\n",
    "# Tokenise every description with jieba and keep only results with more than one token.\n",
    "# The former extra comparison of the token list against a newline string was always True\n",
    "# (a list never equals a str), so it filtered nothing and has been removed.\n",
    "# NOTE(review): any row dropped here shifts content_s out of alignment with the rows of data.\n",
    "content_s=[]\n",
    "for line in contents:\n",
    "    current_segment=jieba.lcut(line)\n",
    "    if len(current_segment)>1:\n",
    "        content_s.append(current_segment)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4774,)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The token lists have different lengths; newer NumPy releases refuse to build an array from\n",
    "# such ragged nested lists unless dtype=object is requested explicitly.\n",
    "np.array(content_s, dtype=object).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>content_s</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[合晟, 资产, 是, 一家, 专注, 于, 股票, 、, 债券, 等, 二级, 市场, 投...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[公司, 的, 主营业务, 为, 向, 中小, 微, 企业, 、, 个体, 工商户, 、, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[公司, 立足于, 商业地产, 服务, ，, 致力于, 为, 商业地产, 开发, 、, 销售...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[公司, 经, 工商管理, 部门, 核准, 的, 经营范围, 为, “, 投资, 咨询, 、...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[该, 公司, 的, 主营业务, 为, 在, 中国, 境内, (, 港, 、, 澳, 、, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           content_s\n",
       "0  [合晟, 资产, 是, 一家, 专注, 于, 股票, 、, 债券, 等, 二级, 市场, 投...\n",
       "1  [公司, 的, 主营业务, 为, 向, 中小, 微, 企业, 、, 个体, 工商户, 、, ...\n",
       "2  [公司, 立足于, 商业地产, 服务, ，, 致力于, 为, 商业地产, 开发, 、, 销售...\n",
       "3  [公司, 经, 工商管理, 部门, 核准, 的, 经营范围, 为, “, 投资, 咨询, 、...\n",
       "4  [该, 公司, 的, 主营业务, 为, 在, 中国, 境内, (, 港, 、, 澳, 、, ..."
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Wrap the token lists in a one-column frame (column name: content_s)\n",
    "df_content=pd.DataFrame(dict(content_s=content_s))\n",
    "df_content.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第二部分：去掉停用词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stopwords</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>book</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  stopwords\n",
       "0      book\n",
       "1       NaN"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the stop-word list into a one-column frame.\n",
    "# quoting=3 (csv.QUOTE_NONE) keeps quote characters as ordinary text; na_filter=False stops pandas\n",
    "# from turning words such as \"null\" or \"NA\" into NaN, which could never match a token.\n",
    "# NOTE(review): the recorded output shows only two rows ('book', NaN) - confirm that\n",
    "# ./stopwords.txt really contains the intended stop words and is GBK-encoded.\n",
    "stopwords=pd.read_csv('./stopwords.txt',index_col=False,sep='\\t',quoting=3,names=['stopwords'],encoding='gbk',na_filter=False)\n",
    "stopwords.head()\n",
    "# Reference for the pandas.read_csv parameters: https://www.cnblogs.com/datablog/p/6127000.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 去掉停用词\n",
    "# 自定义一个函数\n",
    "def drop_stopwords(contents,stopwords):\n",
    "    content_clean=[]\n",
    "    all_words=[]\n",
    "    for line in contents:\n",
    "        line_clean=[]\n",
    "        for word in line:\n",
    "            if word  in stopwords:\n",
    "                continue\n",
    "            line_clean.append(word)\n",
    "            all_words.append(str(word))\n",
    "        content_clean.append(line_clean)\n",
    "    return content_clean,all_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "contents=df_content.content_s.values.tolist()\n",
    "# Convert the stop-word frame to a plain list only while it is still a DataFrame, so that\n",
    "# re-running this cell no longer fails with AttributeError on an already-converted list.\n",
    "# Missing entries (NaN) are dropped because they can never equal a token.\n",
    "if isinstance(stopwords,pd.DataFrame):\n",
    "    stopwords=stopwords.stopwords.dropna().values.tolist()\n",
    "content_clean,all_words=drop_stopwords(contents,stopwords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>contents_clean</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[合晟, 资产, 是, 一家, 专注, 于, 股票, 、, 债券, 等, 二级, 市场, 投...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[公司, 的, 主营业务, 为, 向, 中小, 微, 企业, 、, 个体, 工商户, 、, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[公司, 立足于, 商业地产, 服务, ，, 致力于, 为, 商业地产, 开发, 、, 销售...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[公司, 经, 工商管理, 部门, 核准, 的, 经营范围, 为, “, 投资, 咨询, 、...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[该, 公司, 的, 主营业务, 为, 在, 中国, 境内, (, 港, 、, 澳, 、, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                      contents_clean\n",
       "0  [合晟, 资产, 是, 一家, 专注, 于, 股票, 、, 债券, 等, 二级, 市场, 投...\n",
       "1  [公司, 的, 主营业务, 为, 向, 中小, 微, 企业, 、, 个体, 工商户, 、, ...\n",
       "2  [公司, 立足于, 商业地产, 服务, ，, 致力于, 为, 商业地产, 开发, 、, 销售...\n",
       "3  [公司, 经, 工商管理, 部门, 核准, 的, 经营范围, 为, “, 投资, 咨询, 、...\n",
       "4  [该, 公司, 的, 主营业务, 为, 在, 中国, 境内, (, 港, 、, 澳, 、, ..."
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One-column frame (contents_clean) holding the filtered token lists\n",
    "df_content1=pd.DataFrame(dict(contents_clean=content_clean))\n",
    "df_content1.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. 特征提取（20分）： 去掉停用词后（stopwords.txt），采用TFIDF作为每个文本的特征描述。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'合晟 资产 是 一家 专注 于 股票 、 债券 等 二级 市场 投资 ， 为 合格 投资者 提供 专业 资产 管理 服务 的 企业 。 公司 业务范围 包括 资产 管理 、 投资 咨询 和 投资 顾问 服务 。 公司 管理 的 私募 基金 产品 主要 包括 股票 型 、 债券 型 资产 管理 计划 或 证券 投资 基金 ， 管理 总资产 规模 80 亿元 左右 。 根据 中国 证券 投资 基金业 协会 数据 ， 公司 管理 的 私募 证券 投资 基金 （ 顾问 管理 ） 类 规模 较大 ， 公司 管理 规模 处于 50 亿元 以上 的 第一 梯队 。'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Prepare the corpus: each document becomes one space-delimited string, the input format\n",
    "# handed to TfidfVectorizer below.\n",
    "X_train=df_content1.contents_clean\n",
    "# Join every row unconditionally (tokens are coerced with str) so that words keeps exactly one\n",
    "# entry per row. The previous bare except only printed the index of a failing row and skipped\n",
    "# it, which silently shifted every later document relative to the frame.\n",
    "words=[' '.join(map(str,tokens)) for tokens in X_train]\n",
    "words[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the TF-IDF feature matrix with scikit-learn\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "Vectorizer=TfidfVectorizer(\n",
    "    analyzer='word',\n",
    "    max_features=30,\n",
    "    lowercase=False,\n",
    ")\n",
    "# fit_transform returns a sparse matrix; toarray() converts it to a dense ndarray\n",
    "X=Vectorizer.fit_transform(words).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4774, 30)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Dimensions of the dense TF-IDF matrix X\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>专业</th>\n",
       "      <th>业务</th>\n",
       "      <th>主营业务</th>\n",
       "      <th>主要</th>\n",
       "      <th>产品</th>\n",
       "      <th>从事</th>\n",
       "      <th>以及</th>\n",
       "      <th>企业</th>\n",
       "      <th>公司</th>\n",
       "      <th>制造</th>\n",
       "      <th>...</th>\n",
       "      <th>管理</th>\n",
       "      <th>系统</th>\n",
       "      <th>经营</th>\n",
       "      <th>行业</th>\n",
       "      <th>解决方案</th>\n",
       "      <th>设备</th>\n",
       "      <th>设计</th>\n",
       "      <th>通过</th>\n",
       "      <th>销售</th>\n",
       "      <th>领域</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.098809</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.067842</td>\n",
       "      <td>0.066472</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.075507</td>\n",
       "      <td>0.175687</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.938241</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.644582</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.353977</td>\n",
       "      <td>0.205906</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.178533</td>\n",
       "      <td>0.468189</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.187356</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.158719</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.211907</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.196929</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.342915</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.387337</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.212710</td>\n",
       "      <td>0.494927</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.330390</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.304954</td>\n",
       "      <td>0.25096</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.307038</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.203924</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.197175</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.260567</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.314507</td>\n",
       "      <td>0.562958</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 30 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         专业        业务      主营业务        主要        产品   从事        以及        企业  \\\n",
       "0  0.098809  0.000000  0.000000  0.067842  0.066472  0.0  0.000000  0.075507   \n",
       "1  0.000000  0.000000  0.644582  0.000000  0.000000  0.0  0.000000  0.353977   \n",
       "2  0.178533  0.468189  0.000000  0.000000  0.000000  0.0  0.187356  0.000000   \n",
       "3  0.000000  0.000000  0.387337  0.000000  0.000000  0.0  0.000000  0.212710   \n",
       "4  0.000000  0.000000  0.203924  0.000000  0.197175  0.0  0.000000  0.000000   \n",
       "\n",
       "         公司   制造  ...        管理   系统        经营       行业  解决方案   设备        设计  \\\n",
       "0  0.175687  0.0  ...  0.938241  0.0  0.000000  0.00000   0.0  0.0  0.000000   \n",
       "1  0.205906  0.0  ...  0.000000  0.0  0.000000  0.00000   0.0  0.0  0.000000   \n",
       "2  0.158719  0.0  ...  0.211907  0.0  0.000000  0.00000   0.0  0.0  0.196929   \n",
       "3  0.494927  0.0  ...  0.330390  0.0  0.304954  0.25096   0.0  0.0  0.307038   \n",
       "4  0.260567  0.0  ...  0.000000  0.0  0.000000  0.00000   0.0  0.0  0.000000   \n",
       "\n",
       "         通过        销售   领域  \n",
       "0  0.000000  0.000000  0.0  \n",
       "1  0.000000  0.000000  0.0  \n",
       "2  0.000000  0.342915  0.0  \n",
       "3  0.000000  0.000000  0.0  \n",
       "4  0.314507  0.562958  0.0  \n",
       "\n",
       "[5 rows x 30 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get_feature_names() has been removed from current scikit-learn in favour of\n",
    "# get_feature_names_out(); use whichever one the installed version provides.\n",
    "if hasattr(Vectorizer,'get_feature_names_out'):\n",
    "    feature_names=Vectorizer.get_feature_names_out()\n",
    "else:\n",
    "    feature_names=Vectorizer.get_feature_names()\n",
    "new_train=pd.DataFrame(columns=feature_names,data=X)\n",
    "new_train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. 采用KMeans聚类算法，根据第2 步得到特征对企业进行聚类， 尝试K=5，10，15，20，30，..., 50, 并选择合适的度量指标，选择最佳的K。（60分）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入相应工具包\n",
    "from sklearn.preprocessing import normalize\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features to cluster; this binds x_train to the same object as new_train (no copy is made)\n",
    "x_train=new_train\n",
    "# y_train=data.category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.09880911, 0.        , 0.        , ..., 0.        , 0.        ,\n",
       "        0.        ],\n",
       "       [0.        , 0.        , 0.64458154, ..., 0.        , 0.        ,\n",
       "        0.        ],\n",
       "       [0.1785328 , 0.46818892, 0.        , ..., 0.        , 0.34291453,\n",
       "        0.        ],\n",
       "       ...,\n",
       "       [0.43066352, 0.        , 0.        , ..., 0.        , 0.13786521,\n",
       "        0.        ],\n",
       "       [0.        , 0.13129257, 0.        , ..., 0.16116846, 0.        ,\n",
       "        0.151351  ],\n",
       "       [0.18577683, 0.16239528, 0.12925633, ..., 0.        , 0.        ,\n",
       "        0.18720548]])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Clustering is driven by distances between samples, so scale every sample (row) to unit L2 norm.\n",
    "# normalize(..., copy=False) is not guaranteed to operate in place and its return value was\n",
    "# being discarded; rebind x_train to the normalised result instead, which also leaves\n",
    "# new_train unmodified.\n",
    "x_train=pd.DataFrame(normalize(x_train, norm=\"l2\"),columns=x_train.columns,index=x_train.index)\n",
    "x_train.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# KMeans clustering\n",
    "# Fit and score the model for one candidate number of clusters K\n",
    "def K_cluster_analysis(K, X, random_state=42):\n",
    "    \"\"\"Cluster X into K groups with MiniBatchKMeans and score the partition.\n",
    "\n",
    "    Args:\n",
    "        K: number of clusters to fit.\n",
    "        X: feature matrix, passed unchanged to MiniBatchKMeans.fit_predict and to the metric.\n",
    "        random_state: seed forwarded to MiniBatchKMeans so that repeated runs are reproducible\n",
    "            (pass None to restore the previous unseeded behaviour).\n",
    "\n",
    "    Returns:\n",
    "        The Calinski-Harabasz index of the fitted partition (larger means better clustering).\n",
    "    \"\"\"\n",
    "    print(\"K-means begin with clusters: {}\".format(K))\n",
    "\n",
    "    # Fit on the training data and assign every sample to a cluster in one call\n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters = K, random_state = random_state)\n",
    "    y_pred = mb_kmeans.fit_predict(X)\n",
    "\n",
    "    # Criteria for choosing K.\n",
    "    # The training data is labelled, so a reference-based metric could also be used:\n",
    "    #v_score = metrics.v_measure_score(y_val, y_val_pred)\n",
    "\n",
    "    # Reference-free metrics (Silhouette Coefficient, Calinski-Harabasz Index) are used here;\n",
    "    # for both, a larger value indicates better clustering.\n",
    "    # The misspelled metrics.calinski_harabaz_score no longer exists in current scikit-learn,\n",
    "    # so prefer the corrected name and fall back to the old one only on legacy versions.\n",
    "    ch_metric = getattr(metrics, \"calinski_harabasz_score\", None)\n",
    "    if ch_metric is None:\n",
    "        ch_metric = metrics.calinski_harabaz_score\n",
    "    CH_score = ch_metric(X, y_pred)\n",
    "\n",
    "    # The Silhouette Coefficient is too slow to compute on large samples\n",
    "    #si_score = metrics.silhouette_score(X, y_pred)\n",
    "\n",
    "    print(\"CH_score: {}\".format(CH_score))\n",
    "    #print(\"si_score: {}\".format(si_score))\n",
    "\n",
    "    return CH_score#,si_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 5\n",
      "CH_score: 293.7257442867197\n",
      "K-means begin with clusters: 10\n",
      "CH_score: 230.30015476297152\n",
      "K-means begin with clusters: 15\n",
      "CH_score: 187.60046377498855\n",
      "K-means begin with clusters: 20\n",
      "CH_score: 167.31353188861286\n",
      "K-means begin with clusters: 25\n",
      "CH_score: 145.86785333423654\n",
      "K-means begin with clusters: 30\n",
      "CH_score: 130.3734227396758\n",
      "K-means begin with clusters: 35\n",
      "CH_score: 115.95026602770382\n",
      "K-means begin with clusters: 40\n",
      "CH_score: 106.64899616628902\n",
      "K-means begin with clusters: 45\n",
      "CH_score: 97.29234843061964\n",
      "K-means begin with clusters: 50\n",
      "CH_score: 91.11986923454393\n"
     ]
    }
   ],
   "source": [
    "# Search range for the hyper-parameter K (number of clusters): 5, 10, ..., 50\n",
    "Ks = list(range(5, 51, 5))\n",
    "CH_scores = []\n",
    "#si_scores = []\n",
    "for K in Ks:\n",
    "    # Score this candidate K and record its Calinski-Harabasz index\n",
    "    ch = K_cluster_analysis(K, x_train)\n",
    "    CH_scores.append(ch)\n",
    "    #si_scores.append(si)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAEBCAYAAACQbKXWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xl81OW59/HPTHaSkIUEspNAyAWEJWEXZFPEUmndarV4bK11e7Q97XPU9jytPR7b57TP0VPbY2s3leopp3UBQS2iqIDsi+yC3GwJJCHsyL6H548ZIOlRmYQkM5n5vl+vvMzcc/+Si8vJNb+579/vvj3nzp1DREQiizfYAYiISOtT8RcRiUAq/iIiEUjFX0QkAqn4i4hEIBV/EZEIpOIvIhKBVPxFRCKQir+ISARS8RcRiUAq/iIiESg62AHUEwcMBGqBs0GORUSkrYgCsoFlwMlADwqo+JvZT4CvAOeA551zT5nZGOApIAF42Tn3qL9vGfAc0B6YC9zvnDsTwK8ZCMwLNHAREWlgODA/0M6XLP5mNhK4CugDxADrzex9YCIwEqgCppvZOOfcDGAScLdzbrGZPQ/cA/wugFhqAQ4cOEpdXdtdabRDhyT27TsS7DBChvJxkXLRkPLRUFPz4fV6SEtLBH8NDdQli79z7gMzG+2cO2Nmuf5jUoFNzrkKADObBNxiZuuBBOfcYv/hLwCPE1jxPwtQV3euTRd/oM3H39yUj4uUi4aUj4YuMx+NGi4PaMLXOXfazB4H1gPvAzk0fJepBfI+p11EREJIwBO+zrnHzOzfgTeBEnzj/+d5gDp8byaf1h6wDh2SGtM9JGVmJgc7hJCifFykXDSkfDTUmvkIZMy/OxDvnFvlnDtmZq/hm/yt/xEjC9gBVOObdf779oDt23ekTX8UzMxMZs+ew8EOI2QoHxcpFw0pHw01NR9er6dJJ82BDPt0AZ41szgziwWuB/4AmJkVm1kUMAGY4ZzbBpwws2H+Y+8AZjQ6KhERaVGXLP7OubeA6cBKYDmw0Dn3EnAnMAXfPMAGYLL/kNuBX5rZBiAJeLr5wxYRkcvhCaEN3AuBiqYO+5w7dw6Px9PsQTWWPso2pHxcpFw0pHw01AzDPkVAZcDHNfo3haB1lft59LklHD52KtihiIi0CWFR/NOS4ti5/xhvLqwMdigiIm1CWBT/nIxEhvfJYfaKGnYfOBbscEREQl5YFH+A668sIirKw2tztwY7FBGRkBc2xT8tOY5rBxaw9OPdVNQeCnY4IiIhLWyKP8AXBheQ3C6GV2ZtJoSuYhIRCTlhVfwT4qK5/soiXNUnrNmyL9jhiIiErLAq/gAj+ubQKS2BV+ds4Wxdo5YVEhGJGGFX/KOjvNw8sis79h5lwdqdwQ5HRCQkhV3xB+hvmXTNac+0eVs5eUo7QoqI/L2wLP4ej4dbRhfzyZFTzPywKtjhiIiEnLAs/gAl+amUd8tgxuJtHDqqZR9EROoL2+IP8JVRXTl1uo43F1QGOxQRkZAS1sU/u0MiI8pymLOqhl37teyDiMh5YV38Aa4fVkh0lJcpWvZBROSCgPbwNbPHgK/6H04H5gA/q9clF1jinBvv73sXcMD/3LPOuWeaJ9zGS0mK4wuDC3h9fgVbdhyka05KsEIREQkZgezhOwYYC5Tj25z9bWCRc67M/3wWsAD43/5DBgC3OecWtUjETXDtoHxmr6zh1Vmb+cHt/UJi0xcRkWAKZNinFnjIOXfKOXca+BgoqPf8k8DvnXOb/I8HAD80szVm9hszi2/ekBsvPta37MPG6oOs2rw32OGIiARdIHv4rnPOLQYws274hn/eqvd4FP59es0sCd9ev48A/YBU4MctEXhjDe+TTVZ6OyZr2QcRkcD38DWzUnzj/Y855170tz0J7HfO/fwzjikHJjrnygP4FYVARUDBNNGitbX87IWlPPiVvnzhisKW
/FUiIq2tUXv4BjrhOwyYAnzPOfdSvaduwDcfcL5fATDGOTfR3+QBTgcaDEBTN3APRNdOiRTnpTBpxseUFqQQHxvQP79RtCl1Q8rHRcpFQ8pHQ82wgXvjjrtUBzPLB6YBE+oXfjPLABKcc/XP1o8DT5hZkZl5gAeBqY2OqoV4PB6+OrqYg0dPMXOpln0QkcgVyKnvw0A88JSZnW/7PbACqK7f0Tm3x8zuA94EYoH5wC+aLdpmUJybQn/LZMaS7YwszyUlMTbYIYmItLpLFn/n3HeB737G00M+pf8UfENEIevmkV1ZtWkvb8yv4I5r7dIHiIiEmbC/w/fTZKW3Y2RZDh+s2kHtvqPBDkdEpNVFZPEH+PKwImJivLz2gZZ9EJHIE7HFv31iLF8cXMDyjXvYXH0w2OGIiLSqiC3+AGMHFpCSFMsrszcT6P0OIiLhIKKLf1xsFDdcWcTmmoOs2KhlH0QkckR08Qe4sk822R3aMfmDLZw5q2UfRCQyRHzxj/J6uWVUMbv2H2Pe6h3BDkdEpFVEfPEH6FvcgZL8VF6fX8Hxk2eCHY6ISItT8efisg+Hjp3mnaXbgx2OiEiLU/H365LTnoHdO/L20u18cuRksMMREWlRKv713DyyC2fPnuON+S26srSISNCp+NfTMa0do8tzmbu6lh17teyDiIQvFf+/M35YIbExXqZ8sCXYoYiItBgV/7/Tvl0sXxzSmZWb9rKx6pNghyMi0iJU/D/FNQPzSdWyDyISxlT8P0VcTBQ3Du/C1h2HWO72BDscEZFmF+gevo8BX/U/nO6c+76Z/Qm4Ejg/M/q4c26qmY0BngISgJedc482d9CtYVjvbGYuq2LyB1so65ZBdJTeJ0UkfASyh+8YfJu0lwNlQH8zuxEYAIxwzpX5v6aaWQIwEbge6AEMNLNxLRd+y/F6Pdwyuiu7Dxzng1Va9kFEwksgp7O1wEPOuVPOudPAx0CB/2uima0xs8fNzAsMAjY55yqcc2eAScAtLRV8S+vdpQPdC7Tsg4iEn0D28F13/nsz64Zv+Gc4MAp4ADgI/A34FnAE35vFebVAXmMC6tAhqTHdW9y9N/Xhn341lw/W7uSOcT0COiYzM7mFo2pblI+LlIuGlI+GWjMfAY35A5hZKTAdeMQ554Ab6z33a+DrwGSg/uUxHqBR6yTv23eEurrQucImNT6awT07MW3OZgZbJmnJcZ/bPzMzmT17DrdSdKFP+bhIuWhI+Wioqfnwej1NOmkOaBbTzIYB7wP/7Jx70cx6m9nN9bp4gNNANZBdrz0LaPMD5jeN6MLZunO8Pl/7/YpIeAhkwjcfmAZMcM695G/2AL8yszQziwHuBaYCS3yHWLGZRQETgBktE3rryUxN4Kp+ecxbU0vNniPBDkdE5LIFcub/MBAPPGVmq8xsFTAU+DmwAFgPrHLO/dU5dwK4E5jib9+AbyiozfvSsELiY6OZPEfLPohI2xfIhO93ge9+xtO//ZT+7wN9LzOukJOUEMN1V3Rm8pwtbNh2gO6d04IdkohIk+nOpUYY0z+PtOQ4Xpm9mTot+yAibZiKfyPExkRx04guVO48zIcbdgc7HBGRJlPxb6QrSrPIy0xi8pwtnD7TqKtYRURChop/I3m9Hr46uit7D55gzsqaYIcjItIkKv5NUFqUTs/CNN5cWMmxE1r2QUTaHhX/JvB4PNwyqpgjx08zY8m2YIcjItJoKv5N1DkrmSGlnZi5rIr9h04EOxwRkUZR8b8MNw3vwrlz55g2ryLYoYiINIqK/2XISE3g6v55LFhbS/VuLfsgIm2Hiv9luu6KQhLionlVyz6ISBui4n+ZkhJiGD+0kLVb97G+cn+wwxERCYiKfzO4un8uHdrH8ersLSG1F4GIyGdR8W8GMdFR3DSiK9t2HWbuKt34JSKhT8W/mQwu7URBxyT+9OY6Dh49FexwREQ+l4p/M/F6PNx1XQ+OHD/NM1PXcuas1v0RkdAV0B6+ZvYYvo3b
AaY7575vZvcC/4hvz94Pgfucc6f8fe8CDvj7P+uce6aZ4w5JBZ2S+d6t5Twx6UMmzdzIN75geDyeYIclIvI/XLL4m9kYYCxQjq/Qv21mPwDuBvoDh4EXgAeBXwIDgNucc4taKOaQNrw8l3Vb9jB90TY6d0pidL+8YIckIvI/BHLmXws85Jw7BWBmH+Pb1vEB59whf9taoMDffwDwQzPrDMwFHvZv7xgxbhzehardR/jLe5vIyUjECrTrl4iElkuO+Tvn1jnnFgOYWTd8wz9/cc6962/LBL4NvG5mScBK4BGgH5AK/LiFYg9ZXq+He79USmZqAs9M/Yi9B48HOyQRkQY85wLcjtDMSoHpwGPOuRf9bbnADOBV59xPP+WYcmCic648gF9RCITVIjnVuw/z0H/OJSs9kX//zpXExwY0xSIi0hRFQGWgnQOd8B0GTAG+55x7yd/WHXgHeNo59wt/WwEwxjk30X+oBzgdcOjAvn1H2vSNUpmZyezZcxiAOA/c+6We/Oera3jyv5Zx35dLI24CuH4+Ip1y0ZDy0VBT8+H1eujQIanxx12qg5nlA9OACfUKfzIwE3j0fOH3Ow48YWZFZubBNwk8tdFRhZE+XTO4eVRXln68m7cWa+1/EQkNgZz5P4xvgvcpMzvf9jLQCXjIzB7yt73hnPsXM7sPeBOIBeYDvyDCjRtcwPZdh3ntg63kZSbRtzgj2CGJSIQLeMy/FRQCFeE07FPfydNn+fmk5ez55DiPfn0A2R0SgxBd69NH+4uUi4aUj4aaYdinUWP+usO3lcTFRPGdm/oQHeXl6SlrOXaiUVMhIiLNSsW/FXVIieeBG3qx95Pj/PHN9W36E46ItG0q/q3MCtKYcE0Ja7bs47W5W4MdjohEKF14HgSjy3Op2nWYtxZvI69jIkN6ZgU7JBGJMDrzD5IJ15TQLS+FF97awLadmvQSkdal4h8k0VFeHrixN4kJMfzmtTUc0h4AItKKVPyDKCUxlu/c3JtDx07zW+0BICKtSMU/yAqz2vPNcd3ZWH2Qv763KdjhiEiE0IRvCBhSmkXV7iPMWLKd/I5JjCrPDXZIIhLmdOYfIm4e2ZVeXdL573c3srHqk2CHIyJhTsU/RHi9Hu7/cikZKfH8dupa9h+KqP1vRKSVqfiHkHbxMXzn5j6cOlPHr6es5eTps8EOSUTClIp/iMnJSOTeL5WyfddhXpyxgRBaeE9EwoiKfwgq65bBjSO6sHj9Lt5euj3Y4YhIGFLxD1HXXdGZAd07Mnn2FtZu3RfscEQkzKj4hyiPx8O3vtiDvI5J/P71dezcfyzYIYlIGAmo+JvZY2a2zv/1hL9tjJmtMbNNZvZ/6/UtM7MPzWyjmT1nZrqXoIniYqP4zk29ifJ6+PWUNRw/eSbYIYlImAhkD98xwFigHCgD+pvZ14CJwPVAD2CgmY3zHzIJ+LZzrgTfBu73tETgkSIjNYEHbujFrv3H+eMb66jTBLCININAzvxrgYecc6ecc6eBj4ESYJNzrsI5dwZfwb/FzDoDCc65xf5jXwBuaYG4I0r3zml8bUw3Vm/Zx7R52gNARC7fJYdknHPrzn9vZt2ArwK/xvemcF4tkAfkfEZ7wPx7UbZpmZnJzf4zb722O3sOneRvC7fRs2smw8vazhIQLZGPtkq5aEj5aKg18xHweLyZlQLTgUeAM/jO/s/zAHX4Pkmc+5T2gIXrBu7N4ebhRWyt/oRfvbSCdtEeCjqF/h+ONum+SLloSPloqBk2cG/ccYF0MrNhwPvAPzvnXgSqgex6XbKAHZ/TLs0gJtrLgzf2IjE+hl9PWcvhY9oDQESaJpAJ33xgGjDBOfeSv3mJ7ykrNrMoYAIwwzm3DTjhf7MAuAOY0QJxR6yUpDi+fVNvDh49xe+mfaQ9AESkSQI5838YiAeeMrNVZrYKuNP/NQVYD2wAJvv73w780sw2AEnA080cc8Qrym7PneOMDds/4eX3Nwc7HBFpgwKZ8P0u
8N3PeLrvp/RfDQy6zLjkEob2yqZq9xHeWVpFfqckRvTNCXZIItKG6A7fNuwro7pSWpTOn99xbK4+GOxwRKQNUfFvw6K8Xu6/vpQOKfH8RnsAiEgjqPi3cYn+PQBOnj7Lb15byyntASAiAVDxDwO5GYncO74nlTsP8+Lb2gNARC5NxT9MlJdkcsPwIhat28XMZVXBDkdEQpxW3Awj44cWUrX7CK/M3kxmagL9SjKDHZKIhCid+YcRr8fDt67rQX5mEr95bS2/nfYRew8eD3ZYIhKCVPzDTHxsND+8oz83DC9izea9/OjZJbw+v0ITwSLSgIZ9wlBsTBRfHlbEsF7ZvDJ7M6/Pr2D+mlpuvaqY/paJx+MJdogiEmQ68w9jHVLi+V839OL7XysnIS6K3077iP94aRXVe44EOzQRCTIV/wjQvXMaj31zIP8wtoTtuw7zrxOX8d/vbuToidPBDk1EgkTDPhEiyuvlqn55DOrRialztzJrRTVL1u/ippFdGNEnB69XQ0EikURn/hEmKSGGO641HrtzIDkZifzX246fvLiMTdWfBDs0EWlFKv4RqqBTMj+YUM7915dy+Nhpfj5pBX98Yx0HDp8Mdmgi0go07BPBPB4Pg3p0om/XDKYv3sbbS7azctNexg/tzNiB+cRERwU7RBFpIY3Zw7c9sBAYD/QEflbv6VxgiXNuvJk9BtwFHPA/96xz7plmildaQFxsFDeN6MKVfbJ5ZdZmpnywlXmra7nt6m70Le6gS0NFwlBAxd/MBgPP4t+03Tn3FvCW/7ksYAHwv/3dBwC3OecWNXu00qI6pibw7Zt6s65iP395byNPT1lDry7pfO3qbmR3SAx2eCLSjAId878HeJBP34z9SeD3zrlN/scDgB+a2Roz+42ZxTdDnNKKSovSefyuQdx2dTe21BzkX55fysuzNnH85JlghyYizSSg4u+cu9s5N+/v282sGzAK/z69ZpYErAQeAfoBqcCPmytYaT3RUV7GDszn5/dewdBeWcxcWsX/+eNi5q+ppU5LRou0eZ7GrP1uZpXAKOdcpf/xk8B+59zPP6N/OTDROVcewI8vBCoCDkZa1aaqA/xh6lrctgOUFKRy3419KClIC3ZYInJREVAZaOfLvdrnBmDs+QdmVgCMcc5N9Dd5gEbdRrpv3xHq6trumWVmZjJ79hwOdhjNLjU+mkduK2Pxup28OnsLD/3nXIb1zuIrI7uSkhT3mceFaz6aQrloSPloqKn58Ho9dOiQ1Ojjmlz8zSwDSHDO1T9bPw48YWaz8b0DPQhMbervkNDi9XgY2iub8m6Z/G1hJTOXVbHc7eHLw4oYMyCP6CjdNiLSVlzOX2sXoLp+g3NuD3Af8Cbg8J35/+IyfoeEoIS4aG4ZXcxP7x5MSX4qr8zezL88v5S1W/cFOzQRCVCjxvxbWCFQoWGftmf15r289P4mdh04TllxBrddXUzHtHZAZObjsygXDSkfDTXDsE+rjvmL0Lc4g56F6bz3YRVvLKzk0eeWcO2gAq67onOwQxORz6DiL80iJtrLuCGduaJXFpPnbGH6om0sWFvLvTf0wXKTdZewSIjRDJ00q9SkOO4e35Mf3tGflMQ4npj0IU+9sppdB44FOzQRqUfFX1pEcW4KP/7GAO67sTdbag7y4+eW8uaCCk6fqQt2aCKChn2kBXm9HsZf2YWSnPb89f1NTJ1XweL1u7hjrNG9s24QEwkmnflLi0tLjuOBG3rxvVv6cvpMHU/8dSXP/W09h46dCnZoIhFLxV9aTZ+uHfjp3YO57orOLFm/ix/9cTFzV+/QWkEiQaDiL60qLiaKm0d25V/vGkRuZhIvzNjA/5u0gurdR4IdmkhEUfGXoMjNSOQHE8q564s92Ln/GI+/sIxXZ2/m5KmzwQ5NJCJowleCxuPxcGWfbMq6ZfDK7M3MWLKdpR/v5vaxJZQVZwQ7PJGwpjN/CbqkhBju+mIP/vn2fsTFRvH05DX85rW17D90
ItihiYQtFX8JGSX5qfzrNwdy88gufLR1Hz96bgkzl27nbJ3uDRBpbir+ElKio7xcd0Whb8XQvFRemrWZn77wIVt3HAp2aCJhRcVfQlJmagLfu6UPD9zQi0PHTvFv//Uhf37HcexEo/YGEpHPoAlfCVkej4cB3TtSWpTO1HlbeX95Ncs37uG2q4sZ3KOTFosTuQwBF38zaw8sBMY75yrN7E/AlcBRf5fHnXNTzWwM8BSQALzsnHu0uYOWyJIQF82EMSUM65XNi29v4I9vrGfBmlr+YazRKb1dsMMTaZMCGvYxs8HAfKCkXvMAYIRzrsz/NdXMEoCJwPVAD2CgmY1r7qAlMnXOSubRrw/g9mtK2Fp7iB8/v5Q35muxOJGmCPTM/x58+/H+GcDM2gEFwEQzy8W3T+/jwCBg0/l9fc1sEnALMKOZ45YI5fV6uLp/Hv1KMnl51iamza9g0fpdfH1sCT0K04MdnkibEdCZv3PubufcvHpNWcAs4C5gCDAc+BaQA9TW61cL5DVPqCIXpSXHcf/1vfinr/alrq6OJ19axbNvruPQUS0WJxKIJk34Oue2Ajeef2xmvwa+DkwG6q/S5QEa9Zncvxdlm5aZmRzsEEJKS+ZjdGYyQ/vl8+p7G5kyexNrtu7nzut6MnZwZ7ze0JsQ1mujIeWjodbMR5OKv5n1Bkqcc1P8TR7gNFANZNfrmgXsaMzP1gbu4aW18nHtgDx6F6bx53ccz0xezduLKvj6td3J7xg6JxN6bTSkfDTUDBu4N+64Rh/h4wF+ZWZpZhYD3Itv3H8JYGZWbGZRwAQ03i+tJCcjke9PKOdb1/Vg1/7jPP6nZbwyS4vFiXyapg77rDGznwMLgBhginPurwBmdicwBYgH3sI3FCTSKjweD8N6Z9O3OIPJczbz9tLtLN2wi+uuKGRoryziYqKCHaJISPCcC52NNAqBCg37hJdg52NT9Sf89b1NVO48TFJCDKPKc7m6Xy4pSXGtHkuwcxFqlI+GmmHYpwioDPQ43eErYa1bXio//sYANlZ9wsxlVUxfWMnbS7YxpGcWYwfmkxdCcwIirUnFX8Kex+PBCtKwgjR27j/Gux9WsWBNLfPX1lJalM61A/MpLUrXchESUVT8JaJkpbfjjrHGjcO7MGdlDe8vr+apV1aTm5HI2IH5DCnNIiZa6x1K+FPxl4iUlBDD+KGFXDuogKUf7+KdpVX8acYGpszdylX9chldnktyu9hghynSYlT8JaLFRHsZ1jubob2yWL/tADOXVjFtXgXTF21jWK8srhmYT3aHxGCHKdLsVPxF8M0LlBamU1qYTs3eo7y7bDvz1+5kzqod9O3agbGDCuhekKp5AQkbKv4ifyc3I5E7x/XgxhFdmb2imlkranjyrysp6JTEtQMLGNijI9FRmheQtk3FX+QzpCTGcsPwLnxxSGcWrdvJzGVVPPu39Uz+YAtX989jZFkOifExwQ5TpElU/EUuITYmipFluQzvm8NHW/fxztIqJs/ZwpsLKrmyTzbXDMijY5o2lZG2RcVfJEBej4c+XTPo0zWD7bsOM3NZFXNW1jBreTX9SjIZOyif4twUzQtIm6DiL9IEBZ2SuXt8T24e2ZVZK6qZs7KG5Rv30CWnPWMH5tPfMonyal5AQpeKv8hlSEuO4+aRXRl/RSHz19by7rIqfv/6Ojq0j+eaAXkM75tDQpz+zCT06FUp0gziYqO4un8eo8tzWbV5LzOXbuelWZt5fUEFI/rmMKZ/vjYukZCi4i/SjLxeD/1KMulXkklF7SHeWbqdd5dV8+6yaob1zeHKXp00LyAhQcVfpIUUZbfn/ut7sW/UCd5bXsW8NbXMW1VDfsckRvfLZUjPTsTH6k9QgkPr+TczrVHekPJxUXL7BN6cu5lZy2uo3nOEhLgohvXKZnS/3IhcQkKvjYZCdj1/M2sPLATGO+cqzexe4B/xbdj+IXCfc+6UmT0G3AUc8B/6rHPumUB/j0i4io+LZlRZ
LiP75rCl5hCzVlQze2UN7y2vpkfnNK7ql0tZtwxdJSStIqDib2aDgWeBEv/jEuARoD9wGHgBeBD4JTAAuM05t6gF4hVp8zweD8V5KRTnpXDr1d2Yt3oHc1bV8MzUj0hLjmNUWQ4j+uYEZbcxiRyBnvnfg6+4/9n/+CTwgHPuEICZrQUK/M8NAH5oZp2BucDDzrkTzReySPhISYxl/NBCxg0pYM3mfcxaWcPUeRW8saCS/pbJVf3y6JanCWJpfo0a8zezSmCUc66yXlsmsAy4E9/wzyvAPwGb8X0i2Oac+1EAP74QqAg4GJEwVbPnCG8trOD9pds5euIMhdnt+eLQQkb1z9c9A/J5GjXmf1nF38xygRnAq865n35K/3JgonOuPIAfX4gmfMOO8nFRY3Nx8tRZlny8i1nLq9m++wjxsRcniHMy2v4EsV4bDYXshO/fM7PuwDvA0865X/jbCoAxzrmJ/m4e4HRTf4dIJIuLjWJE3xyG98lmy45DzF5RzQera3h/RTXdC1K5ql8eZd0ytLy0NEmTir+ZJQMzgR855/5c76njwBNmNhvfO9CDwNTLDVIkknk8HopzUyjOTeHWq7oxb80O5qzcwW+nfURqUiyjynIZUZZDqiaIpRGaeuZ/N9AJeMjMHvK3veGc+xczuw94E4gF5gO/uPwwRQSgfWIs111RyLjBnVmzZR+zVlQzbX4Fby6spF9JJlf1y6UkXzuOyaXpJq9mpnHMhpSPi1oqF7sOHGP2ihrmr6nl2Mkz5GYmclV5LkNKs0J6glivjYbazJi/iISGTmntuO3qbtw4ogtL1+9i1ooa/jxzI6/M2cLQXllcVZ5LbmZSsMOUEKPiLxIm4mKiGN43hyv7ZLO19hCzV9Qwb3Uts1fU0L0gldH98ijXBLH4qfiLhBmPx0PXnBS65qRw61XFzF9Ty+yVNfxu2kekJMVyZe9senfpQJec9nojiGAq/iJhLLldLOOGdObaQQWs3bqPWStqeGvxNqYv2kZcbBSWn0ppYTo9i9LJ6dBOE8URRMVfJAJ4vR76FmfQtziDYydOs2H7J6yr3M/6iv2s2bIPgNSkWHoWplNamE6PwjRdOhrmVPxFIky7+JgLG84A7D14nPWVB1hf6XsjWPj0sMLZAAAJu0lEQVTRTgByMxN9nwoK07D8NOJio4IZtjQzFX+RCJeRksCIvgmM6JtD3blzVO06wvrK/ayr3M+sFTXMXFZFlNd3o1nPIt+bQVFWe7xeDRG1ZSr+InKB1+Ohc1YynbOSGTekM6dOn2VTzUHWV+xnfeUBps7dytS50C4umh6d0+hZmEbPonQ6piZovqCNUfEXkc8UGxNFqX8eAODwsVN8vM03RLSu4gDLN+4BICMl3vdGUJhOz8J0khJighm2BEDFX0QCltwulkE9OjGoRyfOnTvH7gPHfRPHlQdYtmEPc1fX4gEKOiXTsyiN0sJ0uuWlEBOt+YJQo+IvIk3i8XjolN6OTuntuKpfHmfr6qisPeyfLzjAzKVVzFi8nZhoLyV5vvmC0sJ08jom4dUQUdCp+ItIs4jyeumam0LX3BS+NKyIE6fOsLHqE9ZV+IaJXp29hVfZQnK7GHp0TmNI7xwKMtqR3j4+2KFHJBV/EWkR8bHR9OmaQZ+uGQAcOHySj7f55grWb9vP0o93A5CV3s5/o1ka3QvSQnoxunCiLItIq0hLjmNor2yG9srm3LlzHD8L81dUsa7yAPPW7uD9FdVEeT0U5bS/MMlclJNMlFdLULQEFX8RaXUej4fO2cm0G1TA2EEFnD5Tx5aag/7J4/28Mb+C1+dXkBAXRfeCNEr98wUd03RJaXMJqPibWXtgITDeOVdpZmOAp4AE4GXn3KP+fmXAc0B7YC5wv3PuTItELiJhIybaS/fOaXTvnMbNI7ty5PhpNmw7wEcVvjeDlZv2AtChfTylRbqktDlcsvib2WDgWaDE/zgBmAiMBKqA6WY2zjk3A5gE3O2cW2xmzwP3
AL9rqeBFJDwlJcQwoHtHBnTv6Luk9JPjrK/wXUXU4JLSrGT/EFEaxXmpxERriChQgZz534NvL97ze/UOAjY55yoAzGwScIuZrQcSnHOL/f1eAB5HxV9ELoPH46FTWjs6pbVjdL1LSs8vTPfO0u28tXgbsdFeSvJT6VmYTq+idHIzEzVE9DkuWfydc3cDmNn5phygtl6XWiDvc9pFRJpN/UtKvzysiOMnz+DOr1JauZ9XZm/mldmQkhjb4K7jtGStUlpfUyZ8vUD9TXY9QN3ntDeKfy/KNi0zMznYIYQU5eMi5aKh5spHQV4a1wwtAmDPgeOs3rSblRv3sHrTHhat2+Xrk5VMWUkm5SUd6dWlA/EheElpa74+mvKvrway6z3OAnZ8TnujaAP38KJ8XKRcNNSS+ehblE7fonTqxpZQvfsI6yr3s65iP28tqOSNuVsbrFJanJtCXmYiye1iWySWQDXDBu6N0pTivwQwMysGKoAJwETn3DYzO2Fmw5xzC4A7gBlN+PkiIs3C6/FQ0CmZgk7JjBvsX6W0+uCF+YKpc7de6JuSFEt+ZhJ5mUnkdUwkLzOJ7A6JYTuJ3Oji75w7YWZ3AlOAeOAtYLL/6duBZ/2Xhq4Anm6mOEVELltsTJTvnoGidBgNh46domrXEap2H6F6j+/rveVVnDnrG32I8nrISm9HXsck8jJ9bwh5mUmkt49r85PJnnPnQmaIpRCo0LBPeFE+LlIuGgrVfJytq2PX/uNU7/G9KdTsOUrV7iPsO3TiQp+EuGjyMxPJ7Zh04dNCbmbiZS1N0QzDPkVAZaDHhd6Mh4hIEEV5veRkJJKTkcigHp0utB87cYaavUeo3n2E6j1HqdpzhEUf7WT2qbMX+mSkxPuHjZLI939a6JiWEJJLVKj4i4gEoF18NN3yUumWl3qh7dy5c+w7dILq3UcvDBtV7T7Cmi37qPOPqsREe8npkEhex0TyM5MufFponxjcCWYVfxGRJvJ4PGSkJJCRkkBZt4wL7afPnGXH3mMX3hCqdx/ho637WbB254U+7dvF+OcSkujcKZnrRiS2auwq/iIizSwmOurCXsj1HTp6yv+GcNQ/fHSE2StrOH2mjrycFPLTE1otRhV/EZFW0j4xlp6JvjuOz6urO8fhY6coLspo1Qnw0JuFEBGJIF6vh5Sk1l96QsVfRCQCqfiLiEQgFX8RkQik4i8iEoFU/EVEIpCKv4hIBAql6/yjwHfZU1sXDv+G5qR8XKRcNKR8NNSUfNQ7Jqoxx4XSqp5XAvOCHYSISBs1HJgfaOdQKv5xwEB8e/+evURfERHxicK3i+Iy4GSgB4VS8RcRkVaiCV8RkQik4i8iEoFU/EVEIpCKv4hIBFLxFxGJQCr+IiIRSMVfRCQChdLyDm2SmbUHFgLjnXOVZjYGeApIAF52zj0a1ABbkZk9BnzV/3C6c+77EZ6PnwBfAc4BzzvnnorkfACY2X8AGc65O82sDHgOaA/MBe53zp0JaoCtxMxmAx2B0/6m+4CuwKNADPAr59wzLRmDzvwvg5kNxnc7dYn/cQIwEbge6AEMNLNxwYuw9fiL2ligHCgD+pvZ14jcfIwErgL6AAOA75hZXyI0HwBmdjXwjXpNk4BvO+dKAA9wT1ACa2Vm5sFXM/o658qcc2VANfBv+Ja5KQPuNbOeLRmHiv/luQd4ENjhfzwI2OScq/CfwUwCbglWcK2sFnjIOXfKOXca+BjfCzwi8+Gc+wAY7f93d8T3KTuVCM2HmaXjK24/8z/uDCQ45xb7u7xAhOQCMP9/Z5rZajP7NjAGmOWc2++cOwpMxvepscVo2OcyOOfuBjA7//+SHHxF8LxaIK+VwwoK59y689+bWTd8wz+/JkLzAeCcO21mjwMPA68Swa8P4A/Aj4B8/+NIzkUa8D7wHXxDPHOAl/mf+RjUkkHozL95efGN757nAeqCFEtQmFkp8C7wCLCVCM+Hc+4xIBNf0SshAvNhZncDVc659+s1
R+zfinNukXPu6865g865vcDzwE9o5XzozL95VeNbXe+8LC4OCYU9MxsGTAG+55x7yT/uHZH5MLPuQLxzbpVz7piZvYbvY3z9FWsjJR+3AtlmtgpIB5LwFbpIfW1cCcTVezP0AJW0cj5U/JvXEsDMrBioACbgm+ALe2aWD0wDbnXOzfI3R2w+gC7A4/4/9HP4Jnn/ADwZaflwzl1z/nszuxMY5Zz7ppl9ZGbDnHMLgDuAGcGKsZWlAj8xs6H4hn2+AfwDMMnMMoGjwM3AvS0ZhIZ9mpFz7gRwJ76z3/XABnwTN5HgYSAeeMrMVvnP8u4kQvPhnHsLmA6sBJYDC51zLxGh+fgMtwO/NLMN+D4NPB3keFqFc+5vNHxtTPS/Af4ImA2sAv7inFvaknFoPX8RkQikM38RkQik4i8iEoFU/EVEIpCKv4hIBFLxFxGJQCr+IiIRSMVfRCQCqfiLiESg/w9JSr+9kqaHjgAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plot the clustering score for every candidate K; the best model/parameter is the one with the highest score.\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-', label='CH_scores')\n",
    "plt.xlabel('K')\n",
    "plt.ylabel('CH_scores')\n",
    "plt.legend()  # the label passed to plt.plot is only drawn once a legend is requested\n",
    "\n",
    "# Best hyperparameter: CH_scores is treated as a 1-D sequence aligned with Ks, so np.argmax\n",
    "# (position of the first maximum) is enough; no round-trip through np.unravel_index is needed.\n",
    "index = int(np.argmax(CH_scores))\n",
    "Best_K = Ks[index]\n",
    "\n",
    "print(Best_K)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "本例中最佳K=5，但实际标签分类有10种。"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
